filter out IE = 0 (missing value)
filter out testelapse value in the top or bottom 1% (remove outliers in people spend too little or too much time on the test)
ggplot(data=data,aes(IE)) + geom_histogram() +theme_classic() + ggtitle('IE value distribution')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data=data,aes(testelapse)) + geom_boxplot() +theme_classic() + ggtitle('Test Elapse and Outliers')
# filter with testlapse in 1% and 99% quantile
data_filter <- data %>%
filter(IE!=0 & testelapse>= quantile(testelapse,0.01) & testelapse <= quantile(testelapse,0.99) )
data_final <- data_filter %>%
select(ends_with("A"),IE,age,gender)
1.independent variables: answers for 91 questions (categorical,1,2,3,4,5)
2.dependent variables: introvert/extrovert (categorical, 1= yes, 0 = no/not sure)
data_final <- data_final%>%
mutate(IE_extrovert=ifelse(IE==2,1,0),
IE_introvert=ifelse(IE==1,1,0))
head(data_final[,1:90])
## Q1A Q2A Q3A Q4A Q5A Q6A Q7A Q8A Q9A Q10A Q11A Q12A Q13A Q14A Q15A Q16A Q17A
## 1 5 3 1 2 3 2 3 3 4 5 1 5 3 5 1 3 3
## 2 5 5 1 5 2 2 5 2 1 3 2 2 3 3 2 2 4
## 3 3 4 5 3 4 5 5 5 5 5 4 5 4 1 1 1 1
## 4 5 2 1 1 5 5 5 4 4 2 5 5 2 1 1 1 2
## 5 1 2 1 1 3 3 5 1 3 4 1 4 5 2 1 5 3
## 6 2 5 5 1 2 4 5 2 4 4 1 4 2 2 1 2 2
## Q18A Q19A Q20A Q21A Q22A Q23A Q24A Q25A Q26A Q27A Q28A Q29A Q30A Q31A Q32A
## 1 4 1 4 1 3 5 1 4 5 5 4 1 2 2 2
## 2 5 2 2 4 4 3 5 5 5 1 5 2 2 3 5
## 3 1 2 1 3 2 5 4 4 4 5 2 4 5 4 5
## 4 4 1 1 1 4 2 2 3 4 3 2 5 5 2 5
## 5 5 5 5 5 4 5 5 5 5 4 2 1 1 4 1
## 6 2 1 1 4 4 2 1 5 3 2 1 5 1 2 1
## Q33A Q34A Q35A Q36A Q37A Q38A Q39A Q40A Q41A Q42A Q43A Q44A Q45A Q46A Q47A
## 1 4 5 4 2 1 1 5 2 5 4 5 5 1 5 2
## 2 2 2 4 5 4 4 3 3 1 3 4 1 1 4 2
## 3 2 2 3 5 1 4 2 4 4 4 5 5 5 4 3
## 4 4 2 2 3 2 5 1 5 4 5 3 5 2 4 1
## 5 4 4 4 5 2 5 4 3 2 4 5 3 1 5 1
## 6 5 4 4 1 1 4 1 5 5 2 4 3 4 2 1
## Q48A Q49A Q50A Q51A Q52A Q53A Q54A Q55A Q56A Q57A Q58A Q59A Q60A Q61A Q62A
## 1 4 5 3 1 1 5 3 1 4 4 5 2 4 1 2
## 2 2 4 2 2 1 1 1 3 1 2 1 3 1 1 4
## 3 1 1 5 5 5 4 3 5 4 5 4 5 4 5 3
## 4 4 1 4 5 1 1 5 1 4 4 2 5 5 4 1
## 5 4 3 1 2 1 2 3 1 1 5 1 2 2 5 5
## 6 1 1 4 1 1 1 2 5 5 1 2 5 1 4 2
## Q63A Q64A Q65A Q66A Q67A Q68A Q69A Q70A Q71A Q72A Q73A Q74A Q75A Q76A Q77A
## 1 2 1 3 1 4 1 1 5 1 5 3 2 5 5 3
## 2 5 1 5 1 1 1 1 5 1 5 3 2 5 2 4
## 3 2 5 2 5 1 4 1 5 1 1 1 3 4 1 1
## 4 1 2 4 2 1 1 5 5 4 1 3 2 4 4 1
## 5 4 1 2 2 5 4 2 5 4 3 1 5 5 5 5
## 6 1 5 5 1 5 4 1 1 1 4 2 1 5 1 4
## Q78A Q79A Q80A Q81A Q82A Q83A Q84A Q85A Q86A Q87A Q88A Q89A Q90A
## 1 5 3 4 2 1 3 2 1 4 2 5 4 3
## 2 2 5 4 2 1 2 2 2 1 3 4 4 4
## 3 3 4 1 5 5 5 5 5 4 5 3 2 1
## 4 4 4 1 5 5 5 5 5 3 5 4 4 3
## 5 4 4 5 3 2 3 1 1 3 1 2 5 5
## 6 2 2 1 5 2 2 3 1 3 1 2 2 2
1.Boxplot for values of each question
2.Heatmap for correlation matrix
# boxplot for each question
data_final %>% gather(Question_number,Question_answer,starts_with('Q')[1:20]) %>%
ggplot(aes(Question_answer,col=Question_number)) + geom_boxplot() +
facet_wrap(.~Question_number) + theme_classic()
data_final %>% gather(Question_number,Question_answer,starts_with('Q')[21:40]) %>%
ggplot(aes(Question_answer,col=Question_number)) + geom_boxplot() +
facet_wrap(.~Question_number) + theme_classic()
data_final %>% gather(Question_number,Question_answer,starts_with('Q')[41:60]) %>%
ggplot(aes(Question_answer,col=Question_number)) + geom_boxplot() +
facet_wrap(.~Question_number) + theme_classic()
data_final %>% gather(Question_number,Question_answer,starts_with('Q')[61:80]) %>%
ggplot(aes(Question_answer,col=Question_number)) + geom_boxplot() +
facet_wrap(.~Question_number) + theme_classic()
data_final %>% gather(Question_number,Question_answer,starts_with('Q')[81:91]) %>%
ggplot(aes(Question_answer,col=Question_number)) + geom_boxplot() +
facet_wrap(.~Question_number) + theme_classic()
# heatmap
cormat <- data_final %>%
select(starts_with('Q')) %>%
cor()
heatmap(cormat)
train <- data_final[1:5000,]
test <- data_final[5000:nrow(data_final),]
# multiple logistic regression for extrovert question selection
logistic_ex_reg <- function(x){
summary(glm(data=train,IE_extrovert~x,family='binomial'))$coefficient[2,]
}
coef <- apply(train[,1:91],2,logistic_ex_reg)[1,]
se <- apply(train[,1:91],2,logistic_ex_reg)[2,]
pval <- apply(train[,1:91],2,logistic_ex_reg)[4,]
results <- data.frame(cbind(coef,se,pval))
results$Q <- 1:91
# multiple logistic regression for introvert question selection
logistic_in_reg <- function(x){
summary(glm(data=train,IE_introvert~x,family='binomial'))$coefficient[2,]
}
coef_in <- apply(train[,1:91],2,logistic_in_reg)[1,]
se_in <- apply(train[,1:91],2,logistic_in_reg)[2,]
pval_in <- apply(train[,1:91],2,logistic_in_reg)[4,]
results_in <- data.frame(cbind(coef_in,se_in,pval_in))
results_in$Q <- 1:91
# Volcano plot ------------------------------------------------
#x:log(coef)
#y:-log(SE)
#test size:(1/se^2)
pval_bonf = 0.05/nrow(results)
library(ggthemes)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
p <- results %>%
ggplot(aes(x = coef,y = -log10(pval),col=Q,label=rownames(results))) +
geom_point(size=1.5) +
geom_hline(yintercept = -log10(pval_bonf),
col = 'grey40',
linetype = 'dotted') +
annotate('text',x=-1.0, y= -log10(pval_bonf + 0.000001),
label=paste('p=' ,pval_bonf),
size=2,col='grey30') +
theme_few()+
ggtitle('Questions for extrovert determination')
p + geom_text(check_overlap = TRUE,aes(size= 1/se^2))
ggplotly(p)
p2 <- results_in %>%
ggplot(aes(x = coef_in,y = -log10(pval_in),col=Q,label=rownames(results_in))) +
geom_point(size=1.5) +
geom_hline(yintercept = -log10(pval_bonf),
col = 'grey40',
linetype = 'dotted') +
annotate('text',x=-0.5, y= -log10(pval_bonf + 0.000001),
label=paste('p=' ,pval_bonf),
size=2,col='grey30') +
theme_few()+
ggtitle('Questions for introvert determination')
p2 + geom_text(check_overlap = TRUE,aes(size= 1/se_in^2))
ggplotly(p2)
for extrovert: Q80A+Q81A+Q83A+Q84A+Q91A for introvert: Q80A+Q82A+Q84A+Q89A+Q90A+Q91A
# logistic regression for predicting
model_ex <- glm(data=train, IE_extrovert ~ Q80A+Q81A+Q83A+Q84A+Q91A,family='binomial')
summary(model_ex)
##
## Call:
## glm(formula = IE_extrovert ~ Q80A + Q81A + Q83A + Q84A + Q91A,
## family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0840 -0.3654 -0.1732 -0.0855 3.3509
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.47658 0.29109 -1.637 0.102
## Q80A 0.33885 0.04290 7.899 2.81e-15 ***
## Q81A -0.36180 0.04553 -7.947 1.92e-15 ***
## Q83A -0.51117 0.05388 -9.488 < 2e-16 ***
## Q84A -0.30194 0.04737 -6.375 1.83e-10 ***
## Q91A 0.40154 0.04890 8.212 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3976.4 on 4999 degrees of freedom
## Residual deviance: 2349.8 on 4994 degrees of freedom
## AIC: 2361.8
##
## Number of Fisher Scoring iterations: 6
model_in <- glm(data=train, IE_introvert ~ Q80A+Q82A+Q84A+Q89A+Q90A+Q91A,family='binomial')
summary(model_in)
##
## Call:
## glm(formula = IE_introvert ~ Q80A + Q82A + Q84A + Q89A + Q90A +
## Q91A, family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5699 -0.6675 0.3401 0.6842 2.4967
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.16378 0.24782 4.696 2.65e-06 ***
## Q80A -0.29400 0.03110 -9.454 < 2e-16 ***
## Q82A 0.44347 0.03151 14.075 < 2e-16 ***
## Q84A 0.17068 0.03629 4.703 2.56e-06 ***
## Q89A -0.15918 0.03385 -4.703 2.57e-06 ***
## Q90A -0.22090 0.03507 -6.299 3.00e-10 ***
## Q91A -0.29581 0.03377 -8.761 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6664.6 on 4999 degrees of freedom
## Residual deviance: 4535.3 on 4993 degrees of freedom
## AIC: 4549.3
##
## Number of Fisher Scoring iterations: 5
Model for probability of extrovert: log(p/1-p) = -0.47658 + 0.33885* Q80A -0.36180*Q81A -0.51117*Q83A -0.30194*Q84A + 0.40154*Q91A
Model for probability of introvert: log(p/1-p) = 1.16378 -0.29400* Q80A + 0.44347*Q82A +0.17068*Q84A -0.15918*Q89A -0.22090*Q90A -0.0.29581*Q91A
test$predict_ex <- ifelse(predict(model_ex,newdata = test,type='response')>0.5,1,0)
test$predict_in <- ifelse(predict(model_in,newdata = test,type='response')>0.5,1,0)
#predict probability
accuracy_extrovert <- mean(test$IE_extrovert == test$predict_ex)
paste('The accuracy of our extrovert predicting model is ', round(accuracy_extrovert*100,digits=3),'%',sep='')
## [1] "The accuracy of our extrovert predicting model is 90.45%"
accuracy_introvert <- mean(test$IE_introvert == test$predict_in)
paste('The accuracy of our introvert predicting model is ', round(accuracy_introvert*100,digits=3),'%',sep='')
## [1] "The accuracy of our introvert predicting model is 80.604%"
## parallel connection
test$predict_ex_par <- ifelse(predict(model_ex,newdata = test,type='response')>0.5 |
predict(model_in,newdata = test,type='response')<0.5,1,0)
test$predict_in_par <- ifelse(predict(model_ex,newdata = test,type='response')<0.5 |
predict(model_in,newdata = test,type='response')>0.5,1,0)
accuracy_extrovert_par <- mean(test$IE_extrovert == test$predict_ex_par)
paste('The accuracy of our parallel connection extrovert predicting model is ', round(accuracy_extrovert_par*100,digits=3),'%',sep='')
## [1] "The accuracy of our parallel connection extrovert predicting model is 76.447%"
accuracy_introvert_par <- mean(test$IE_introvert == test$predict_in_par)
paste('The accuracy of our parallel connection introvert predicting model is ', round(accuracy_introvert_par*100,digits=3),'%',sep='')
## [1] "The accuracy of our parallel connection introvert predicting model is 71.747%"
## series connection
test$predict_ex_ser <- ifelse(predict(model_ex,newdata = test,type='response')>0.5 &
predict(model_in,newdata = test,type='response')<0.5,1,0)
test$predict_in_ser <- ifelse(predict(model_ex,newdata = test,type='response')<0.5 &
predict(model_in,newdata = test,type='response')>0.5,1,0)
accuracy_extrovert_ser <- mean(test$IE_extrovert == test$predict_ex_ser)
paste('The accuracy of our series connection extrovert predicting model is ', round(accuracy_extrovert_ser*100,digits=3),'%',sep='')
## [1] "The accuracy of our series connection extrovert predicting model is 90.45%"
accuracy_introvert_ser <- mean(test$IE_introvert == test$predict_in_ser)
paste('The accuracy of our series connection introvert predicting model is ', round(accuracy_introvert_ser*100,digits=3),'%',sep='')
## [1] "The accuracy of our series connection introvert predicting model is 80.604%"
Generally, we prefer series models combining the two models. The predicted model has 90.5% accuracy for extrovert and 80.6% for introvert.